Video

# lm1 <- lm(formula = tenure ~ age + year_num + AB + OPS, data = people_tenure)
# 
# # people_tenure_preds <- people_tenure %>%
# #   mutate(tenure_pred = lm1$fitted.values,
# #          resids = lm1$residuals)
# 
# tidy(lm1)
# 
# yardstick::metrics(data = people_tenure_preds, truth = tenure, estimate = tenure_pred)

# library(GGally)
# 
# people_tenure %>%
#   # select(-c(playerID, yearID, nameFirst, nameLast, birthDate, 
#   #           debut, finalGame, stint, teamID, lgID)) %>%
#   select(tenure, age, year_num, TAB, OPS) %>%
#   ggpairs()
# 
# people_tenure %>%
#   # select(-c(playerID, yearID, nameFirst, nameLast, birthDate, 
#   #           debut, finalGame, stint, teamID, lgID)) %>%
#   select(tenure, G, R, H, X2B, HR, TBB) %>%
#   ggpairs()
# 
# people_tenure %>%
#   # select(-c(playerID, yearID, nameFirst, nameLast, birthDate, 
#   #           debut, finalGame, stint, teamID, lgID)) %>%
#   select(tenure, SB, SO, TB, SlugPct, OBP) %>%
#   ggpairs()
# data.frame(age = c(33),
#            year_num = c(10),
#            TAB = c(600),
#            OPS = c(1)) %>%
#   predict(lm_theory, newdata = .)
# performance::check_outliers(lm_theory)
# Lahman::Fielding %>% head()
# Lahman::Salaries %>% head()
# Lahman::Appearances %>% arrange(desc(yearID)) %>% head()
# Lahman::AllstarFull %>% head()

# fielding <- Lahman::Fielding %>%
#   group_by(playerID, yearID) %>%
#   arrange(desc(G)) %>%
#   mutate(pos_rank = row_number()) %>%
#   ungroup() %>%
#   filter(pos_rank == 1) %>%
#   select(playerID, yearID, POS, G, GS, InnOuts, PO, A, E) %>%
#   anti_join(old_players) %>%
#   rename(G_pos = G)
# 
# appearances <- Lahman::Appearances %>%
#   group_by(playerID, yearID) %>%
#   summarise(G_batting = sum(G_batting),
#             G_defense = sum(G_defense),
#             G_dh = sum(G_dh),
#             G_ph = sum(G_ph))
# 
# people_tenure_all <- people_tenure %>%
#   left_join(fielding, by = c("playerID", "yearID")) %>%
#   replace_na(list(G_pos = 0, GS = 0, InnOuts = 0, PO = 0, A = 0, E = 0)) %>%
#   mutate(fielding_pct = (PO + A) / (PO + A + E)) %>%
#   group_by(POS) %>%
#   mutate(field_pct_scaled = scale(fielding_pct)) %>%
#   ungroup() %>%
#   left_join(appearances, by = c("playerID", "yearID"))



# people_tenure <- people_tenure %>%
#   left_join(appearances, by = c("playerID", "yearID")) %>%
  

To do

General modeling outline

  1. build simple baseline model
  2. build simple blackbox model
  3. analyze residuals from blackbox model
  4. feature engineer to try and correct large residuals
# tidymodels
# library(rsample)
# library(recipes)
# library(parsnip)
# library(tune)
# library(dials)
# library(workflows)
# library(yardstick)
# run this if treesnip not installed: remotes::install_github("curso-r/treesnip")

Most important hyperparameters in xgBoost: https://blog.dataiku.com/narrowing-the-search-which-hyperparameters-really-matter#:~:text=We%20again%20found%20the%20most,be%20seen%20in%20figure%203.

Top 5:

  1. learning rate
  2. subsample
  3. min_child_weight
  4. colsample_bytree
  5. max_depth
# xgb_grid <- grid_latin_hypercube(
#   learn_rate(),
#   sample_size = sample_prop(),
#   min_n(),
#   tree_depth(),
#   trees(),
#   finalize(mtry(), pt_train),
#   loss_reduction(),
#  
#   size = 100
# )
# 
# xgb_grid
# xgb_wf <- workflow() %>%
#   add_formula(tenure ~ year_num + age + TAB + OPS) %>%
#   add_model(xgb_spec)
# 
# xgb_wf
# pt_folds <- vfold_cv(pt_train, v = 5)
# 
# pt_folds
# library(xgboost)
# library(doParallel)
# doParallel::registerDoParallel()
# 
# set.seed(234)
# xgb_res <- tune_grid(
#   xgb_wf,
#   resamples = pt_folds,
#   grid = xgb_grid #,
#   #control = control_grid(save_pred = TRUE)
# )
# 
# xgb_res
# collect_metrics(xgb_res)
# show_best(xgb_res, "rsq")
# best_rsq <- select_best(xgb_res, "rsq")
# best_rsq
# final_xgb <- finalize_workflow(
#   xgb_wf,
#   best_rsq
# )
# 
# final_xgb
# # install.packages("vip")
# library(vip)
# 
# final_xgb %>%
#   fit(data = pt_train) %>%
#   pull_workflow_fit() %>%
#   vip(geom = "col")
# 
# final_res <- last_fit(final_xgb, pt_split)
# 
# collect_metrics(final_res)
# 
# set.seed(123)
# # https://www.tidymodels.org/learn/work/bayes-opt/
# # https://towardsdatascience.com/which-evaluation-metric-should-you-use-in-machine-learning-regression-problems-20cdaef258e
# 
# xgb_set <- parameters(xgb_wf)
# 
# search_res <-
#   xgb_wf %>% 
#   tune_bayes(
#     resamples = pt_folds,
#     # To use non-default parameter ranges
#     param_info = xgb_set,
#     # Generate five at semi-random to start
#     initial = 10,
#     iter = 50,
#     # How to measure performance?
#     metrics = metric_set(rmse),
#     control = control_bayes(no_improve = 30, verbose = TRUE)
#   )
# 
# show_best(search_res, metric = "rmse")
# best_rsq_bayes <- select_best(search_res, "rmse")
# xgb <- boost_tree(
#   # learn_rate = 0.02,          ## step size
#   # sample_size = .5,
#   # min_n = 38,
#   # # mtry = tune(),
#   # tree_depth = 6, 
#   # #colsample_bytree = tune(),
#   # trees = 1000, 
#   # loss_reduction = 0.001,      ## first three: model complexity
#   # #              ## randomness
#   
#     
# ) %>% 
#   set_engine("xgboost") %>% 
#   set_mode("regression")
# 
# xgb_workflow <- workflow() %>%
#   add_formula(tenure ~ year_num + age + TAB + OPS) %>%
#   add_model(xgb)
# 
# xgb_fit <- 
#   xgb_workflow %>%
#   fit(data = pt_train)

#https://rviews.rstudio.com/2019/06/19/a-gentle-intro-to-tidymodels/

# theory_formula <- formula(tenure ~ year_num + age + TAB + OPS)
# 
# xgb_theory <- boost_tree(mode = "regression") %>%
#   set_engine("xgboost") %>%
#   fit(theory_formula, data = pt_train)
# 
# xgb_theory %>%
#   predict(pt_test) %>%
#   bind_cols(pt_test) %>%
#   metrics(truth = tenure, estimate = .pred)
# xgb_spec <- boost_tree(
#   learn_rate = tune(),          ## step size
#   sample_size = tune(),
#   min_n = tune(),
#   # mtry = tune(),
#   tree_depth = tune(), 
#   #colsample_bytree = tune(),
#   trees = tune(), 
#   loss_reduction = tune(),      ## first three: model complexity
#   #              ## randomness
#   
#     
# ) %>% 
#   set_engine("xgboost") %>% 
#   set_mode("regression")
# 
# xgb_spec
# xgb_wf2 <- workflow() %>%
#   add_formula(tenure ~ year_num + age + TAB + OPS + TB + H + G + TBB + X2B + HR + SO + age + X3B + BABIP) %>%
#   add_model(xgb_spec)
# 
# xgb_res2 <- tune_grid(
#   xgb_wf2,
#   resamples = pt_folds,
#   grid = xgb_grid,
#   control = control_grid(save_pred = TRUE)
# )
# 
# show_best(xgb_res2, "rsq")
---
title: "R Notebook"
output: html_notebook
---

# Video

<video width="320" height="240" controls>
  <source src="most-xTB-pujols.mp4" type="video/mp4">
</video>

```{r}
# lm1 <- lm(formula = tenure ~ age + year_num + AB + OPS, data = people_tenure)
# 
# # people_tenure_preds <- people_tenure %>%
# #   mutate(tenure_pred = lm1$fitted.values,
# #          resids = lm1$residuals)
# 
# tidy(lm1)
# 
# yardstick::metrics(data = people_tenure_preds, truth = tenure, estimate = tenure_pred)
```



```{r}

# library(GGally)
# 
# people_tenure %>%
#   # select(-c(playerID, yearID, nameFirst, nameLast, birthDate, 
#   #           debut, finalGame, stint, teamID, lgID)) %>%
#   select(tenure, age, year_num, TAB, OPS) %>%
#   ggpairs()
# 
# people_tenure %>%
#   # select(-c(playerID, yearID, nameFirst, nameLast, birthDate, 
#   #           debut, finalGame, stint, teamID, lgID)) %>%
#   select(tenure, G, R, H, X2B, HR, TBB) %>%
#   ggpairs()
# 
# people_tenure %>%
#   # select(-c(playerID, yearID, nameFirst, nameLast, birthDate, 
#   #           debut, finalGame, stint, teamID, lgID)) %>%
#   select(tenure, SB, SO, TB, SlugPct, OBP) %>%
#   ggpairs()

```



```{r}
# data.frame(age = c(33),
#            year_num = c(10),
#            TAB = c(600),
#            OPS = c(1)) %>%
#   predict(lm_theory, newdata = .)
```



```{r}
# performance::check_outliers(lm_theory)
```


```{r}
# Lahman::Fielding %>% head()
# Lahman::Salaries %>% head()
# Lahman::Appearances %>% arrange(desc(yearID)) %>% head()
# Lahman::AllstarFull %>% head()

```

```{r}

# fielding <- Lahman::Fielding %>%
#   group_by(playerID, yearID) %>%
#   arrange(desc(G)) %>%
#   mutate(pos_rank = row_number()) %>%
#   ungroup() %>%
#   filter(pos_rank == 1) %>%
#   select(playerID, yearID, POS, G, GS, InnOuts, PO, A, E) %>%
#   anti_join(old_players) %>%
#   rename(G_pos = G)
# 
# appearances <- Lahman::Appearances %>%
#   group_by(playerID, yearID) %>%
#   summarise(G_batting = sum(G_batting),
#             G_defense = sum(G_defense),
#             G_dh = sum(G_dh),
#             G_ph = sum(G_ph))
# 
# people_tenure_all <- people_tenure %>%
#   left_join(fielding, by = c("playerID", "yearID")) %>%
#   replace_na(list(G_pos = 0, GS = 0, InnOuts = 0, PO = 0, A = 0, E = 0)) %>%
#   mutate(fielding_pct = (PO + A) / (PO + A + E)) %>%
#   group_by(POS) %>%
#   mutate(field_pct_scaled = scale(fielding_pct)) %>%
#   ungroup() %>%
#   left_join(appearances, by = c("playerID", "yearID"))



# people_tenure <- people_tenure %>%
#   left_join(appearances, by = c("playerID", "yearID")) %>%
  
```


# To do

- analyse big residuals
- feature engineer to try and fix big residuals
- train using xgBoost

# General modeling outline

1. build simple baseline model
2. build simple blackbox model
3. analyze residuals from blackbox model
4. feature engineer to try and correct large residuals




```{r}
# tidymodels
# library(rsample)
# library(recipes)
# library(parsnip)
# library(tune)
# library(dials)
# library(workflows)
# library(yardstick)
# run this if treesnip not installed: remotes::install_github("curso-r/treesnip")

```









Most important hyperparameters in xgBoost: https://blog.dataiku.com/narrowing-the-search-which-hyperparameters-really-matter#:~:text=We%20again%20found%20the%20most,be%20seen%20in%20figure%203.

Top 5:  

1. learning rate 
2. subsample
3. min_child_weight
4. colsample_bytree
5. max_depth
```{r}
# xgb_grid <- grid_latin_hypercube(
#   learn_rate(),
#   sample_size = sample_prop(),
#   min_n(),
#   tree_depth(),
#   trees(),
#   finalize(mtry(), pt_train),
#   loss_reduction(),
#  
#   size = 100
# )
# 
# xgb_grid
```

```{r}
# xgb_wf <- workflow() %>%
#   add_formula(tenure ~ year_num + age + TAB + OPS) %>%
#   add_model(xgb_spec)
# 
# xgb_wf
```

```{r}
# pt_folds <- vfold_cv(pt_train, v = 5)
# 
# pt_folds
```


```{r}
# library(xgboost)
# library(doParallel)
# doParallel::registerDoParallel()
# 
# set.seed(234)
# xgb_res <- tune_grid(
#   xgb_wf,
#   resamples = pt_folds,
#   grid = xgb_grid #,
#   #control = control_grid(save_pred = TRUE)
# )
# 
# xgb_res
```

```{r}
# collect_metrics(xgb_res)
```


```{r}
# show_best(xgb_res, "rsq")
```



```{r}
# best_rsq <- select_best(xgb_res, "rsq")
# best_rsq
```

```{r}
# final_xgb <- finalize_workflow(
#   xgb_wf,
#   best_rsq
# )
# 
# final_xgb
```

```{r}
# # install.packages("vip")
# library(vip)
# 
# final_xgb %>%
#   fit(data = pt_train) %>%
#   pull_workflow_fit() %>%
#   vip(geom = "col")
# 
# final_res <- last_fit(final_xgb, pt_split)
# 
# collect_metrics(final_res)
```













```{r}
# 
# set.seed(123)
# # https://www.tidymodels.org/learn/work/bayes-opt/
# # https://towardsdatascience.com/which-evaluation-metric-should-you-use-in-machine-learning-regression-problems-20cdaef258e
# 
# xgb_set <- parameters(xgb_wf)
# 
# search_res <-
#   xgb_wf %>% 
#   tune_bayes(
#     resamples = pt_folds,
#     # To use non-default parameter ranges
#     param_info = xgb_set,
#     # Generate five at semi-random to start
#     initial = 10,
#     iter = 50,
#     # How to measure performance?
#     metrics = metric_set(rmse),
#     control = control_bayes(no_improve = 30, verbose = TRUE)
#   )
# 
# show_best(search_res, metric = "rmse")
# best_rsq_bayes <- select_best(search_res, "rmse")
```




```{r}
# xgb <- boost_tree(
#   # learn_rate = 0.02,          ## step size
#   # sample_size = .5,
#   # min_n = 38,
#   # # mtry = tune(),
#   # tree_depth = 6, 
#   # #colsample_bytree = tune(),
#   # trees = 1000, 
#   # loss_reduction = 0.001,      ## first three: model complexity
#   # #              ## randomness
#   
#     
# ) %>% 
#   set_engine("xgboost") %>% 
#   set_mode("regression")
# 
# xgb_workflow <- workflow() %>%
#   add_formula(tenure ~ year_num + age + TAB + OPS) %>%
#   add_model(xgb)
# 
# xgb_fit <- 
#   xgb_workflow %>%
#   fit(data = pt_train)

#https://rviews.rstudio.com/2019/06/19/a-gentle-intro-to-tidymodels/

# theory_formula <- formula(tenure ~ year_num + age + TAB + OPS)
# 
# xgb_theory <- boost_tree(mode = "regression") %>%
#   set_engine("xgboost") %>%
#   fit(theory_formula, data = pt_train)
# 
# xgb_theory %>%
#   predict(pt_test) %>%
#   bind_cols(pt_test) %>%
#   metrics(truth = tenure, estimate = .pred)
```




```{r}
# xgb_spec <- boost_tree(
#   learn_rate = tune(),          ## step size
#   sample_size = tune(),
#   min_n = tune(),
#   # mtry = tune(),
#   tree_depth = tune(), 
#   #colsample_bytree = tune(),
#   trees = tune(), 
#   loss_reduction = tune(),      ## first three: model complexity
#   #              ## randomness
#   
#     
# ) %>% 
#   set_engine("xgboost") %>% 
#   set_mode("regression")
# 
# xgb_spec
```




```{r}
# xgb_wf2 <- workflow() %>%
#   add_formula(tenure ~ year_num + age + TAB + OPS + TB + H + G + TBB + X2B + HR + SO + age + X3B + BABIP) %>%
#   add_model(xgb_spec)
# 
# xgb_res2 <- tune_grid(
#   xgb_wf2,
#   resamples = pt_folds,
#   grid = xgb_grid,
#   control = control_grid(save_pred = TRUE)
# )
# 
# show_best(xgb_res2, "rsq")
```


```{r}

```

